6  Load packages and define base directory

from pathlib import Path
import os
import numpy as np
import xarray as xr
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
# Define base_dir for consistent path management
notebook_dir = Path(os.getcwd()).resolve()
base_dir = notebook_dir.parent
print(base_dir)
/home/cgoehler/team-extra/ndvi-time-series-prediction

7 Analysing prediction results and comparing models

As mentioned in the Data Preprocessing chapter (Challenges in Modelling) the criterion for selection of cubes was the completenes of the time series. The four cubes with the fewest measurement and time gaps were chosen (I: Cube 665, II: Cube 80, III: Cube 1203, IV: Cube 1301). For comparison we plot the NDVI time series original data with the prediction results of the different models for one example pixel per cube. The example pixel is chosen by the minimum test MSE of the SARIMA model for each cube, as this model seems to capture the course of the NDVI time series the best. The MSE is calculated with:

\(MSE = \frac{1}{n} \sum_{i=1}^{n}(Y_i - \bar Y_i)^2\)

With - \(Y_i\) … observed value - \(\bar Y_i\) … predicted value

# Function to load data from NetCDF file and defining variables
def load_nc_file(file_path, ndvi_var_name, x_dim_name, y_dim_name, mse_name=""):
    ds = xr.open_dataset(file_path)
    try:
        ndvi = ds[ndvi_var_name]
        times = ds['time']
        x = ds[x_dim_name] # lat for lstm
        y = ds[y_dim_name] # lon for lstm
        mse = ds[mse_name]
        return ndvi, times, x, y, mse
    except:
        ndvi = ds[ndvi_var_name]
        times = ds['time']
        x = ds[x_dim_name] # lat for lstm
        y = ds[y_dim_name] # lon for lstm
        return ndvi, times, x, y
# Function to determine the pixel with the lowest test mse (SARIMA) in each cube
def find_min_test_mse(datasets, cube_names):
    """
    Find and print the pixel with the lowest test MSE in each dataset.
    
    Parameters:
    datasets (list of str): List of paths to the NetCDF datasets.
    cube_names (list of str): List of names for each cube corresponding to the datasets.
    """
    for i, dataset_path in enumerate(datasets):
        pred = xr.open_dataset(dataset_path)
        test_mse_values = pred['test_mse'].values
        min_test_mse_value = np.nanmin(test_mse_values)
        min_test_mse_index = np.unravel_index(np.nanargmin(test_mse_values), test_mse_values.shape)
        min_test_mse_coords = (pred['x'][min_test_mse_index[0]].item(), pred['y'][min_test_mse_index[1]].item())

        print(f"The minimum test MSE for {cube_names[i]} is: {min_test_mse_value} for pixel ({min_test_mse_coords})")

# Paths to the datasets
datasets = [
    base_dir / "data" / "data_predictions" / f'SARIMA_predicted_ds_B_Cube_{cube}_train.nc'
    for cube in [665, 80, 1203, 1301]
]

# Names of the cubes
cube_names = [
    'Cube 665',
    'Cube 80',
    'Cube 1203',
    'Cube 1301'
]

# Call the function
find_min_test_mse(datasets, cube_names)
The minimum test MSE for Cube 665 is: 0.0006814400117864885 for pixel ((93, 32))
The minimum test MSE for Cube 80 is: 0.003545171194203701 for pixel ((46, 84))
The minimum test MSE for Cube 1203 is: 0.0006025049006614665 for pixel ((49, 42))
The minimum test MSE for Cube 1301 is: 0.0058360315822941905 for pixel ((122, 106))
# cubes considered
cubes = [665, 80, 1203, 1301]
# pixel with best MSE for SARIMA for each cube
best_xy = [[93, 32], [46, 84], [49, 42], [122, 106] ]

lstm_cube_mse_list = []
sarima_cube_mse_list = []
lstm_pixel_mse_list = []
sarima_pixel_mse_list = []
for i, cube in enumerate(cubes):
    # directory to each cube
    directory = base_dir / "data"

    # file names of training and testing data as well as predictions by LSTM and SARIMA model
    train_nc = directory / f'data_train/ds_B_Cube_{cube}_train.nc'
    test_nc = directory / f'data_test/Cube_{cube}_test.nc'
    lstm_nc = directory / f'data_predictions/LSTM_predicted_Cube_{cube}_test.nc'
    sarima_nc = directory / f'data_predictions/SARIMA_predicted_ds_B_Cube_{cube}_train.nc'
    if cube == 665:
        rf_nc = directory / 'data_predictions/Random_Forest_Cube_665.nc'
        rf_ndvi, rf_times, rf_x, rf_y, rf_mse = load_nc_file(rf_nc, 'pred_ndvi', 'x', 'y', 'mse')
        rf_ndvi.where(rf_ndvi==0.0)

    # import datasets and define variables
    train_ndvi, train_times, train_x, train_y = load_nc_file(train_nc, 'NDVI', 'x', 'y')
    test_ndvi, test_times, test_x, test_y = load_nc_file(test_nc, 'NDVI', 'x', 'y')
    lstm_ndvi, lstm_times, lstm_x, lstm_y, lstm_mse = load_nc_file(lstm_nc, 'NDVI', 'x', 'y', 'MSE')
    sarima_ndvi, sarima_times, sarima_x, sarima_y, sarima_mse = load_nc_file(sarima_nc, 'predictions', 'x', 'y', 'test_mse')

    x = best_xy[i][0]
    y = best_xy[i][1]

    # plot time series of training, testing and model data
    fig, ax = plt.subplots(1, 1, figsize=(15, 7))


    if cube == 665:
        plt.plot(train_times, train_ndvi[:,x,y],
             label='interpolated train NDVI')
        plt.plot(test_times, test_ndvi[:,x,y],
             label='test NDVI')
        plt.plot(lstm_times, lstm_ndvi[:,x,y],
             label='LSTM NDVI')
        plt.plot(sarima_times, sarima_ndvi[x,y,:],
             label='SARIMA NDVI')
        rf_ndvi[:,x,y][np.where(rf_ndvi[:,x,y]==0.)] = np.nan
        plt.plot(test_times, rf_ndvi[:,x,y],
                 label='Random Forest NDVI')
        plt.legend()

    else:
        plt.plot(train_times, train_ndvi[:,x,y],
             label='interpolated train NDVI')
        plt.plot(test_times, test_ndvi[:,x,y],
             label='test NDVI')
        plt.plot(lstm_times, lstm_ndvi[:,x,y],
             label='LSTM NDVI')
        plt.plot(sarima_times, sarima_ndvi[x,y,:],
             label='SARIMA NDVI')
        plt.legend()


    # calculate pixel MSE for LSTM
    mask = np.isfinite(test_ndvi[:,x,y])
    mask = mask[0:len(lstm_times)]
    lstm_pixel_mse = mean_squared_error(test_ndvi[:,x,y][0:len(lstm_times)][mask],
                                        lstm_ndvi[:,x,y][mask])
    print(f'Cube {cube} \nPixel x = {x}, y = {y}')
    print(f'Pixel MSE from SARIMA: {sarima_mse[x,y].values}')
    print(f'Cube MSE from SARIMA: {sarima_mse.mean().values}')
    print(f'Pixel MSE from LSTM: {lstm_pixel_mse}')
    print(f'Cube MSE from LSTM: {lstm_mse.values}')
    if cube == 665:
        print(f'Pixel MSE from Random Forest: {rf_mse[x,y].values}')
        print(f'Cube MSE from Random Forest: {rf_mse.mean().values}')

    # append cube MSE
    lstm_cube_mse_list.append(lstm_mse)
    sarima_cube_mse_list.append(sarima_mse.mean().values)
    # append pixel MSE
    lstm_pixel_mse_list.append(lstm_pixel_mse)
    sarima_pixel_mse_list.append(sarima_mse[x,y].values)

    plt.show()
Cube 665 
Pixel x = 93, y = 32
Pixel MSE from SARIMA: 0.0006814400117864885
Cube MSE from SARIMA: 0.007087799911625777
Pixel MSE from LSTM: 0.0004515084729064256
Cube MSE from LSTM: 0.13620756566524506
Pixel MSE from Random Forest: 0.005258710123598576
Cube MSE from Random Forest: 0.01674204133450985

Cube 80 
Pixel x = 46, y = 84
Pixel MSE from SARIMA: 0.003545171194203701
Cube MSE from SARIMA: 0.05865153656459794
Pixel MSE from LSTM: 0.1886572241783142
Cube MSE from LSTM: 0.20980994403362274

Cube 1203 
Pixel x = 49, y = 42
Pixel MSE from SARIMA: 0.0006025049006614665
Cube MSE from SARIMA: 0.059179499826998834
Pixel MSE from LSTM: 0.0010537380585446954
Cube MSE from LSTM: 0.09221689403057098

Cube 1301 
Pixel x = 122, y = 106
Pixel MSE from SARIMA: 0.0058360315822941905
Cube MSE from SARIMA: 0.06264543209138485
Pixel MSE from LSTM: 0.005719645880162716
Cube MSE from LSTM: 0.2933070659637451

ADD description of plots

7.1 Description of model performance based on example pixels plot (visual comparison)

The prediction results for each model differ a lot.

The SARIMA model capture best the seasonality and trend of the NDVI time series for the 93 prediction steps. For the example pixels of cube 665 and 1203 the seasonality of the test data is followed by the SARIMA model predictions. For the example pixel of cube 80 the SARIMA model predictions fail in predicting the NDVI amplitude in the last months of 2021, but in general are able to follow the seasonality of the data. The SARIMA model predictions follow the NDVI time series for the prediction steps in the example pixel of cube 1301 besides the test data artefact around the turn of the year from 2021 to 2022.

The Random Forest NDVI predictions in cube 665 show strong fluctuations and higher deviations from the test NDVI data then the other models, indicating less accurate predictions. Due to processing time it was not able to run a Random Forest model for the other three cubes.

The LSTM model only predicted a short term period (23 time steps) for the NDVI time series. For the example pixels in cube 665 and 1301 the LSTM model predictions follow the course of the test data for the first 23 time steps. For the example pixels of cube 80 and 1203 the LSTM model predictions fail in predicting the

8 Test Mean Squared Error (MSE) Comparison for each cube

For model evaluation we choose the test Mean Squared Error (MSE), calcuated as the difference between the test and predicted data. The MSE is a widely recognized metric for evaluating the performance of predictive models (Arumugam and Natarajan 2023). We compare the mean test MSE over all pixels of a cube in a bar plot for all models.

# Positions of the bars on the x-axis
bar_width = 0.3
indices = np.arange(len(cubes))
rf_index = 0

# Create the bar plot
fig, ax = plt.subplots(1, 2, figsize=(14, 6))

ax1 = plt.subplot(121)
# Bar positions for each model
bar1 = ax1.bar(indices, lstm_cube_mse_list, bar_width, label='LSTM')
bar3 = ax1.bar(indices + bar_width, sarima_cube_mse_list, bar_width, label='SARIMA')
# Adding the single MSE value as an additional bar
ax1.bar(rf_index + 2 * bar_width, rf_mse.mean().values, bar_width, label='Random Forest', color='m')

# Adding labels, title, and legend
ax1.set_xlabel('Cubes',
              fontsize=16,
              fontweight='semibold')
ax1.set_ylabel('MSE',
              fontsize=16,
              fontweight='semibold')
ax1.set_xticks(indices + bar_width /2)
ax1.set_xticklabels(cubes)
ax1.set_title('Cube MSE',
              fontsize=16,
              fontweight='semibold')
ax1.legend()

ax2 = plt.subplot(122)

# Bar positions for each model
bar2 = ax2.bar(indices, lstm_pixel_mse_list, bar_width, label='LSTM')
bar4 = ax2.bar(indices + bar_width, sarima_pixel_mse_list, bar_width, label='SARIMA')
# Adding the single MSE value as an additional bar
ax2.bar(rf_index + 2 * bar_width, rf_mse[x,y].values, bar_width, label='Random Forest', color='m')

# Adding labels, title, and legend
ax2.set_xlabel('Cubes',
              fontsize=16,
              fontweight='semibold')
ax2.set_ylabel('MSE',
              fontsize=16,
              fontweight='semibold')
ax2.set_xticks(indices + bar_width / 2)
ax2.set_xticklabels(cubes)
ax2.set_title('Pixel MSE',
              fontsize=16,
              fontweight='semibold')
ax2.legend()

# Display the plot
plt.show()

8.1 Description of model performance based on mean test MSE per cube and model

The test MSE of the LSTM model is higher for all cubes than the test MSE of the SARIMA model. That means that the SARIMA model performed better in predictions for all cubes than the LSTM model.

SARIMA performed best in predicting the NDVI time series of cube 665 with a test MSE of ~0.0071. The test MSE for cubes 80, 1203 and 1301 is varying around ~0.06 (with ~0.0587, ~0.0592 and ~0.0627 respectively).

LSTM performed best in predicting the NDVI time series of cube 1203 with a test MST of ~0.0922. The test MSE for cubes 665, 80 and 1301 is varying between 0.14 and 0.3 (with ~0.1362, ~0.2098 and ~0.2933 respectively).

8.2 Description of the models performance based on pixelwise test MSE per example pixel of each cube and model

add description

Schwierigkeiten bei Vergleich LSTM nur für viel kürzeren Zeitraum verfügbare predictions (Modell Konfiguration) RF nur für einen cube predictions verfügbar

beantwortung der research question & final conclusion compare three different models for calculating ndvi predictions from time series data.